In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import *
import plotly.express as px
In [2]:
hr_dataset = pd.read_csv("HR_comma_sep (1).csv")
hr_dataset
Out[2]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years sales salary
0 0.38 0.53 2 157 3 0 1 0 sales low
1 0.80 0.86 5 262 6 0 1 0 sales medium
2 0.11 0.88 7 272 4 0 1 0 sales medium
3 0.72 0.87 5 223 5 0 1 0 sales low
4 0.37 0.52 2 159 3 0 1 0 sales low
... ... ... ... ... ... ... ... ... ... ...
14994 0.40 0.57 2 151 3 0 1 0 support low
14995 0.37 0.48 2 160 3 0 1 0 support low
14996 0.37 0.53 2 143 3 0 1 0 support low
14997 0.11 0.96 6 280 4 0 1 0 support low
14998 0.37 0.52 2 158 3 0 1 0 support low

14999 rows × 10 columns

In [3]:
hr_dataset.shape
Out[3]:
(14999, 10)

Preprocessing

In [4]:
# Lets first check if any rows/columns have any missing values

def display_missing(df):    
    for col in df.columns.tolist():   
        print('{} column missing values: {}'.format(col, df[col].isnull().sum()))   
    print('\n')
    
display_missing(hr_dataset)
satisfaction_level column missing values: 0
last_evaluation column missing values: 0
number_project column missing values: 0
average_montly_hours column missing values: 0
time_spend_company column missing values: 0
Work_accident column missing values: 0
left column missing values: 0
promotion_last_5years column missing values: 0
sales column missing values: 0
salary column missing values: 0


In [5]:
# we realise the sales column is named incorrectly, perhaps a trick to test my attention! I will rename this to "department"
# and monthly has been spelled wrong

hr_dataset.rename(columns = {'sales':'department','average_montly_hours':'average_monthly_hours'}, inplace = True) 
hr_dataset
Out[5]:
satisfaction_level last_evaluation number_project average_monthly_hours time_spend_company Work_accident left promotion_last_5years department salary
0 0.38 0.53 2 157 3 0 1 0 sales low
1 0.80 0.86 5 262 6 0 1 0 sales medium
2 0.11 0.88 7 272 4 0 1 0 sales medium
3 0.72 0.87 5 223 5 0 1 0 sales low
4 0.37 0.52 2 159 3 0 1 0 sales low
... ... ... ... ... ... ... ... ... ... ...
14994 0.40 0.57 2 151 3 0 1 0 support low
14995 0.37 0.48 2 160 3 0 1 0 support low
14996 0.37 0.53 2 143 3 0 1 0 support low
14997 0.11 0.96 6 280 4 0 1 0 support low
14998 0.37 0.52 2 158 3 0 1 0 support low

14999 rows × 10 columns

In [6]:
# No missing values, so we can progress
# Lets have a quick look into some stats of the data

hr_dataset.describe()
Out[6]:
satisfaction_level last_evaluation number_project average_monthly_hours time_spend_company Work_accident left promotion_last_5years
count 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000
mean 0.612834 0.716102 3.803054 201.050337 3.498233 0.144610 0.238083 0.021268
std 0.248631 0.171169 1.232592 49.943099 1.460136 0.351719 0.425924 0.144281
min 0.090000 0.360000 2.000000 96.000000 2.000000 0.000000 0.000000 0.000000
25% 0.440000 0.560000 3.000000 156.000000 3.000000 0.000000 0.000000 0.000000
50% 0.640000 0.720000 4.000000 200.000000 3.000000 0.000000 0.000000 0.000000
75% 0.820000 0.870000 5.000000 245.000000 4.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000
In [7]:
hr_dataset.corr()

# the strongest reasons for leaving are correlated with satisfaction, time_spend_company and work accidents
Out[7]:
satisfaction_level last_evaluation number_project average_monthly_hours time_spend_company Work_accident left promotion_last_5years
satisfaction_level 1.000000 0.105021 -0.142970 -0.020048 -0.100866 0.058697 -0.388375 0.025605
last_evaluation 0.105021 1.000000 0.349333 0.339742 0.131591 -0.007104 0.006567 -0.008684
number_project -0.142970 0.349333 1.000000 0.417211 0.196786 -0.004741 0.023787 -0.006064
average_monthly_hours -0.020048 0.339742 0.417211 1.000000 0.127755 -0.010143 0.071287 -0.003544
time_spend_company -0.100866 0.131591 0.196786 0.127755 1.000000 0.002120 0.144822 0.067433
Work_accident 0.058697 -0.007104 -0.004741 -0.010143 0.002120 1.000000 -0.154622 0.039245
left -0.388375 0.006567 0.023787 0.071287 0.144822 -0.154622 1.000000 -0.061788
promotion_last_5years 0.025605 -0.008684 -0.006064 -0.003544 0.067433 0.039245 -0.061788 1.000000
In [8]:
ppp = hr_dataset.corr()['left'].reset_index().rename(columns = {'index': 'variable', 'left':'left correlation'}) 
ppp = ppp.loc[hr_dataset.corr()['left'].reset_index().left.abs().argsort().values[::-1][:11]].iloc[1:,:]
ppp
Out[8]:
variable left correlation
0 satisfaction_level -0.388375
5 Work_accident -0.154622
4 time_spend_company 0.144822
3 average_monthly_hours 0.071287
7 promotion_last_5years -0.061788
2 number_project 0.023787
1 last_evaluation 0.006567
In [9]:
# lets label encode salary column since its ordinal
# lets onehotencode department colunn since they are not related to each other

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

hr_dataset_encoded = hr_dataset.copy()
hr_dataset_encoded["salary"] = le.fit_transform(hr_dataset_encoded["salary"])
hr_dataset_encoded = pd.concat([hr_dataset_encoded, pd.get_dummies(hr_dataset_encoded['department'], prefix = "department")], axis = 1).drop(['department'],axis = 1)
hr_dataset_encoded
Out[9]:
satisfaction_level last_evaluation number_project average_monthly_hours time_spend_company Work_accident left promotion_last_5years salary department_IT department_RandD department_accounting department_hr department_management department_marketing department_product_mng department_sales department_support department_technical
0 0.38 0.53 2 157 3 0 1 0 1 0 0 0 0 0 0 0 1 0 0
1 0.80 0.86 5 262 6 0 1 0 2 0 0 0 0 0 0 0 1 0 0
2 0.11 0.88 7 272 4 0 1 0 2 0 0 0 0 0 0 0 1 0 0
3 0.72 0.87 5 223 5 0 1 0 1 0 0 0 0 0 0 0 1 0 0
4 0.37 0.52 2 159 3 0 1 0 1 0 0 0 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14994 0.40 0.57 2 151 3 0 1 0 1 0 0 0 0 0 0 0 0 1 0
14995 0.37 0.48 2 160 3 0 1 0 1 0 0 0 0 0 0 0 0 1 0
14996 0.37 0.53 2 143 3 0 1 0 1 0 0 0 0 0 0 0 0 1 0
14997 0.11 0.96 6 280 4 0 1 0 1 0 0 0 0 0 0 0 0 1 0
14998 0.37 0.52 2 158 3 0 1 0 1 0 0 0 0 0 0 0 0 1 0

14999 rows × 19 columns

In [85]:
hr_dataset_encoded.describe().iloc[1:2,].melt().rename(columns = ({'value':'mean'}))
Out[85]:
variable mean
0 satisfaction_level 0.612834
1 last_evaluation 0.716102
2 number_project 3.803054
3 average_monthly_hours 201.050337
4 time_spend_company 3.498233
5 Work_accident 0.144610
6 left 0.238083
7 promotion_last_5years 0.021268
8 salary 1.347290
9 department_IT 0.081805
10 department_RandD 0.052470
11 department_accounting 0.051137
12 department_hr 0.049270
13 department_management 0.042003
14 department_marketing 0.057204
15 department_product_mng 0.060137
16 department_sales 0.276018
17 department_support 0.148610
18 department_technical 0.181345
In [11]:
hr_dataset_encoded.corr()
Out[11]:
satisfaction_level last_evaluation number_project average_monthly_hours time_spend_company Work_accident left promotion_last_5years salary department_IT department_RandD department_accounting department_hr department_management department_marketing department_product_mng department_sales department_support department_technical
satisfaction_level 1.000000 0.105021 -0.142970 -0.020048 -0.100866 0.058697 -0.388375 0.025605 0.011754 0.006373 0.006615 -0.028649 -0.012841 0.007172 0.005715 0.006919 0.004007 0.009185 -0.009345
last_evaluation 0.105021 1.000000 0.349333 0.339742 0.131591 -0.007104 0.006567 -0.008684 0.013965 0.001269 -0.005471 0.002193 -0.009645 0.009662 -0.000311 -0.001989 -0.023031 0.017104 0.013742
number_project -0.142970 0.349333 1.000000 0.417211 0.196786 -0.004741 0.023787 -0.006064 0.009672 0.003287 0.009703 0.004189 -0.027356 0.009728 -0.023064 0.000829 -0.013388 0.000303 0.028596
average_monthly_hours -0.020048 0.339742 0.417211 1.000000 0.127755 -0.010143 0.071287 -0.003544 0.007082 0.006967 -0.001177 0.000524 -0.010783 0.000834 -0.008210 -0.005494 -0.001718 -0.002444 0.013638
time_spend_company -0.100866 0.131591 0.196786 0.127755 1.000000 0.002120 0.144822 0.067433 -0.003086 -0.006053 -0.021116 0.003909 -0.022194 0.115436 0.012096 -0.003919 0.015150 -0.030111 -0.027991
Work_accident 0.058697 -0.007104 -0.004741 -0.010143 0.002120 1.000000 -0.154622 0.039245 -0.002506 -0.009293 0.017167 -0.012836 -0.015649 0.011242 0.011367 0.001246 -0.004955 0.012079 -0.006070
left -0.388375 0.006567 0.023787 0.071287 0.144822 -0.154622 1.000000 -0.061788 -0.001294 -0.010925 -0.046596 0.015201 0.028249 -0.046035 -0.000859 -0.011029 0.009923 0.010700 0.020076
promotion_last_5years 0.025605 -0.008684 -0.006064 -0.003544 0.067433 0.039245 -0.061788 1.000000 -0.001318 -0.038942 0.021268 -0.004852 -0.001531 0.128087 0.049253 -0.037288 0.012353 -0.035605 -0.035799
salary 0.011754 0.013965 0.009672 0.007082 -0.003086 -0.002506 -0.001294 -0.001318 1.000000 0.010058 0.022783 -0.002598 0.028232 -0.116202 -0.000906 0.000782 0.015544 0.008054 0.000379
department_IT 0.006373 0.001269 0.003287 0.006967 -0.006053 -0.009293 -0.010925 -0.038942 0.010058 1.000000 -0.070240 -0.069293 -0.067949 -0.062500 -0.073524 -0.075503 -0.184302 -0.124705 -0.140484
department_RandD 0.006615 -0.005471 0.009703 -0.001177 -0.021116 0.017167 -0.046596 0.021268 0.022783 -0.070240 1.000000 -0.054629 -0.053570 -0.049274 -0.057965 -0.059525 -0.145300 -0.098315 -0.110755
department_accounting -0.028649 0.002193 0.004189 0.000524 0.003909 -0.012836 0.015201 -0.004852 -0.002598 -0.069293 -0.054629 1.000000 -0.052848 -0.048610 -0.057183 -0.058723 -0.143341 -0.096989 -0.109262
department_hr -0.012841 -0.009645 -0.027356 -0.010783 -0.022194 -0.015649 0.028249 -0.001531 0.028232 -0.067949 -0.053570 -0.052848 1.000000 -0.047667 -0.056075 -0.057584 -0.140562 -0.095109 -0.107143
department_management 0.007172 0.009662 0.009728 0.000834 0.115436 0.011242 -0.046035 0.128087 -0.116202 -0.062500 -0.049274 -0.048610 -0.047667 1.000000 -0.051578 -0.052966 -0.129289 -0.087482 -0.098551
department_marketing 0.005715 -0.000311 -0.023064 -0.008210 0.012096 0.011367 -0.000859 0.049253 -0.000906 -0.073524 -0.057965 -0.057183 -0.056075 -0.051578 1.000000 -0.062308 -0.152093 -0.102911 -0.115933
department_product_mng 0.006919 -0.001989 0.000829 -0.005494 -0.003919 0.001246 -0.011029 -0.037288 0.000782 -0.075503 -0.059525 -0.058723 -0.057584 -0.052966 -0.062308 1.000000 -0.156187 -0.105682 -0.119054
department_sales 0.004007 -0.023031 -0.013388 -0.001718 0.015150 -0.004955 0.009923 0.012353 0.015544 -0.184302 -0.145300 -0.143341 -0.140562 -0.129289 -0.152093 -0.156187 1.000000 -0.257967 -0.290608
department_support 0.009185 0.017104 0.000303 -0.002444 -0.030111 0.012079 0.010700 -0.035605 0.008054 -0.124705 -0.098315 -0.096989 -0.095109 -0.087482 -0.102911 -0.105682 -0.257967 1.000000 -0.196636
department_technical -0.009345 0.013742 0.028596 0.013638 -0.027991 -0.006070 0.020076 -0.035799 0.000379 -0.140484 -0.110755 -0.109262 -0.107143 -0.098551 -0.115933 -0.119054 -0.290608 -0.196636 1.000000

Analysis

In [12]:
# we can now create a correlation matrix
fig, axes = plt.subplots(1, 1, figsize=(18, 10))

sns.heatmap(hr_dataset_encoded.corr(), annot=True).set_title('Correlation matrix for HR dataset')
plt.show()
In [13]:
# the variables ranked by absolute value

hr_dataset_encoded.corr()['left'].reset_index().rename(columns = {'index': 'variable', 'left':'left correlation'})\
.loc[hr_dataset_encoded.corr()['left'].reset_index().left.abs().argsort().values[::-1][:11]].iloc[1:,:]
Out[13]:
variable left correlation
0 satisfaction_level -0.388375
5 Work_accident -0.154622
4 time_spend_company 0.144822
3 average_monthly_hours 0.071287
7 promotion_last_5years -0.061788
10 department_RandD -0.046596
13 department_management -0.046035
12 department_hr 0.028249
2 number_project 0.023787
18 department_technical 0.020076
In [14]:
hr_dataset.groupby(['left'])['left'].count().reset_index(name = 'Employee count')
Out[14]:
left Employee count
0 0 11428
1 1 3571
In [15]:
sns.countplot(x="left", data=hr_dataset)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x26ae07c3dc0>
In [16]:
fig, ax = plt.subplots()
fig.set_size_inches(5,5)
model_bar = sns.countplot(x="left", data = hr_dataset)
plt.title("How many employees have stayed and left") 

for p in model_bar.patches:
    model_bar.annotate(p.get_height(), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   size=15,
                   xytext = (0, -35), 
                   textcoords = 'offset points')
In [ ]:
 
In [17]:
time_spend_count = hr_dataset.groupby(['time_spend_company','left'])['left'].count().reset_index(name = 'Employee count')
time_spend_count
Out[17]:
time_spend_company left Employee count
0 2 0 3191
1 2 1 53
2 3 0 4857
3 3 1 1586
4 4 0 1667
5 4 1 890
6 5 0 640
7 5 1 833
8 6 0 509
9 6 1 209
10 7 0 188
11 8 0 162
12 10 0 214
In [18]:
sns.barplot(x = 'time_spend_company', y = 'Employee count', data = time_spend_count, hue = 'left')
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x26ae0b6f430>
In [19]:
number_project = hr_dataset.groupby(['number_project','left'])['left'].count().reset_index(name = 'Employee count')
number_project
Out[19]:
number_project left Employee count
0 2 0 821
1 2 1 1567
2 3 0 3983
3 3 1 72
4 4 0 3956
5 4 1 409
6 5 0 2149
7 5 1 612
8 6 0 519
9 6 1 655
10 7 1 256
In [20]:
sns.barplot(x = 'number_project', y = 'Employee count', data = number_project, hue = 'left')
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x26ae0c222b0>
In [21]:
# we can see those on 2,6,7 project counts are more likely to leave than stay
In [22]:
salary_count = hr_dataset.groupby(['salary','left'])['left'].count().reset_index(name = 'Employee count')
salary_count
Out[22]:
salary left Employee count
0 high 0 1155
1 high 1 82
2 low 0 5144
3 low 1 2172
4 medium 0 5129
5 medium 1 1317
In [23]:
sns.barplot(x = 'salary', y = 'Employee count', data = salary_count, hue = 'left')
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x26ae0b2d9d0>
In [24]:
px.sunburst(salary_count, path = ['salary','left'], values='Employee count')
In [ ]:
# low has highest churn rate, and high has lowest churn rate
In [25]:
# Lets make a pie chart to representing department employee splits

import plotly.express as px

pie = px.pie(hr_dataset, values= hr_dataset['department'].value_counts() , names= hr_dataset['department'].value_counts().index, title='Employee pie chart by department',labels={'value':'employees'})
pie.update_traces(textposition='inside', textinfo='percent+label')
pie.show()
In [26]:
department_count = hr_dataset.groupby(['department','left'])['left'].count().reset_index(name = 'Employee count')
department_count
Out[26]:
department left Employee count
0 IT 0 954
1 IT 1 273
2 RandD 0 666
3 RandD 1 121
4 accounting 0 563
5 accounting 1 204
6 hr 0 524
7 hr 1 215
8 management 0 539
9 management 1 91
10 marketing 0 655
11 marketing 1 203
12 product_mng 0 704
13 product_mng 1 198
14 sales 0 3126
15 sales 1 1014
16 support 0 1674
17 support 1 555
18 technical 0 2023
19 technical 1 697
In [ ]:
 
In [28]:
fig, ax = plt.subplots()
fig.set_size_inches(20,10)
model_bar = sns.barplot(x = 'department', y = 'Employee count', data = department_count, hue = 'left')
plt.title("How many employees have stayed and left by department") 

for p in model_bar.patches:
    model_bar.annotate('{:,.0f}'.format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   size=15,
                   xytext = (0, -10), 
                   textcoords = 'offset points')
In [ ]:
# indeed we conclude accounting, hr and technical have the highest leaving rates.
# we can see most people who left came from sales follwed by technical, however this is to be expected as these are the largest departmets
# the worst employee left rate is at hr
In [29]:
# lets find the highest leaving rates within departments

percentage_split_departments = hr_dataset.groupby(['department','left'])['left'].count().groupby(level=0).apply(lambda x: 100*x/x.sum())\
.reset_index(name='Leaving percentage').round(2)
percentage_split_departments[percentage_split_departments['left']==1].sort_values(by =['Leaving percentage'],ascending = False)

# hr has the highest leaving rate
Out[29]:
department left Leaving percentage
7 hr 1 29.09
5 accounting 1 26.60
19 technical 1 25.62
17 support 1 24.90
15 sales 1 24.49
11 marketing 1 23.66
1 IT 1 22.25
13 product_mng 1 21.95
3 RandD 1 15.37
9 management 1 14.44
In [ ]:
 
In [30]:
box = px.box(hr_dataset, x="left", y="satisfaction_level", color = "left", color_discrete_sequence=px.colors.qualitative.Set1)
box.show()

# clearly those who left we're more unsatsified at work, median 0.41 compared to 0.69
In [75]:
pd.crosstab(hr_dataset['satisfaction_level'],hr_dataset['left']).plot(kind="bar",figsize=(20,5))
plt.title('Employee churn count against Satisfaction level')
plt.xlabel('Satisfaction level ')
plt.ylabel('Count')
plt.show()
In [31]:
fig, axes = plt.subplots(3, 3, figsize=(18, 10))

fig.suptitle('Satisfaction level stripplots')

sns.stripplot(ax=axes[0, 0], data=hr_dataset[hr_dataset['left']==1], x='left', y='satisfaction_level', color = 'r')
sns.stripplot(ax=axes[0, 1], data=hr_dataset[hr_dataset['left']==1], x='last_evaluation', y='satisfaction_level')
sns.stripplot(ax=axes[0, 2], data=hr_dataset[hr_dataset['left']==1], x='number_project', y='satisfaction_level')
sns.stripplot(ax=axes[1, 0], data=hr_dataset[hr_dataset['left']==1], x='average_monthly_hours', y='satisfaction_level')
sns.stripplot(ax=axes[1, 1], data=hr_dataset[hr_dataset['left']==1], x='time_spend_company', y='satisfaction_level')
sns.stripplot(ax=axes[1, 2], data=hr_dataset[hr_dataset['left']==1], x='Work_accident', y='satisfaction_level')
sns.stripplot(ax=axes[2, 0], data=hr_dataset[hr_dataset['left']==1], x='promotion_last_5years', y='satisfaction_level')
sns.stripplot(ax=axes[2, 1], data=hr_dataset[hr_dataset['left']==1], x='department', y='satisfaction_level')
sns.stripplot(ax=axes[2, 2], data=hr_dataset[hr_dataset['left']==1], x='salary', y='satisfaction_level')
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x26ae27d3ac0>
In [ ]:
 
In [ ]:
# from this we are able to see there are  3 groups the client must focus on to avoid losing them

# first cluster left due to extremely unsatisfied, seemingly from being overworked at with 5+ projects and higher average hours than the others
# these guys tend to be around 4/5 years with the company and have not been promoted despite their hard work
# they have a high evaluation score, indeeed the company is overworking great employees

# 2nd cluster aren't so happy, with a lot of them having lower average monthly hours than others
# they have low evaluation scores, many are on their second project and have been with the copmany 3/4 years
# perhpas they feel the lack of growth and the lack of hours isnt paying them enough


# 3rd cluster are very satisfied, so it's interesting to learn why they have left
# they had very high evaluation scores, were around 4/5 projects in
# they have a fairer amount of average monthly values, as compared the cluster 1 who were simialr but unsatsified
# but they were underpayed, with many of them low or medium salary suggesting they left to a company who payed more
In [32]:
# lets look further at these 3 groups
# the first cluster is below 0.15 in satisfaction level

cluster = hr_dataset[(hr_dataset['satisfaction_level']<0.15)]
cluster
Out[32]:
satisfaction_level last_evaluation number_project average_monthly_hours time_spend_company Work_accident left promotion_last_5years department salary
2 0.11 0.88 7 272 4 0 1 0 sales medium
6 0.10 0.77 6 247 4 0 1 0 sales low
11 0.11 0.81 6 305 4 0 1 0 sales low
20 0.11 0.83 6 282 4 0 1 0 sales low
22 0.09 0.95 6 304 4 0 1 0 sales low
... ... ... ... ... ... ... ... ... ... ...
14972 0.11 0.97 6 310 4 0 1 0 accounting medium
14975 0.10 0.79 7 310 4 0 1 0 hr medium
14979 0.09 0.93 6 296 4 0 1 0 technical medium
14991 0.09 0.81 6 257 4 0 1 0 support low
14997 0.11 0.96 6 280 4 0 1 0 support low

1045 rows × 10 columns

In [33]:
# the average number of projects here is 5.9, more than the 3.8 company average
# the average monthly hours here is 266.00, more than the 200.1 monthly average of other employees

cluster.describe().iloc[:,[2,3]]
Out[33]:
number_project average_monthly_hours
count 1045.000000 1045.000000
mean 5.935885 265.996172
std 0.937053 37.860881
min 2.000000 100.000000
25% 6.000000 254.000000
50% 6.000000 273.000000
75% 6.000000 291.000000
max 7.000000 310.000000
In [34]:
hr_dataset_encoded[(hr_dataset_encoded['satisfaction_level']<0.15)].corr()['left'].reset_index()\
.rename(columns = {'index': 'variable', 'left':'left correlation'})\
.loc[hr_dataset_encoded[(hr_dataset_encoded['satisfaction_level']<0.15)]\
.corr()['left'].reset_index().left.abs().argsort().values[::-1][:11]].iloc[1:,:]

# (I created a dataframe of the ranked correlations with left by absolute values)
# these include the departments and salaries now as well, though they do not seem to be that important anyway to reasons for leaving
# within this region of employees, we see the reason many left is due to a low satisfaction level resulting from higher than\
# average monthly hours and having a high number of projects, which is what we saw before
Out[34]:
variable left correlation
0 satisfaction_level -0.770683
3 average_monthly_hours 0.629428
2 number_project 0.620977
1 last_evaluation 0.524160
4 time_spend_company -0.323640
5 Work_accident -0.158207
16 department_sales -0.055669
8 salary 0.042567
13 department_management -0.036086
9 department_IT 0.030156
In [ ]:
# clearly working them less hours and projects would make them happier here
In [35]:
fig, axes = plt.subplots(2, 2, figsize=(18, 10))

fig.suptitle('Countplots of employees in region 1')

sns.countplot(ax=axes[0, 0],x="left", data=cluster)
sns.countplot(ax=axes[0, 1],x="number_project", data=cluster, hue = 'left')
sns.countplot(ax=axes[1, 0],x="average_monthly_hours", data=cluster, hue = 'left')
sns.countplot(ax=axes[1, 1],x="time_spend_company", data=cluster, hue = 'left')
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x26ae27f4370>
In [36]:
# now we look at the second cluster group between 0.35 and 0.5 satisfaction levels

cluster2 = hr_dataset[(hr_dataset['satisfaction_level']<0.5)&(hr_dataset['satisfaction_level']>0.35)]
cluster2
Out[36]:
satisfaction_level last_evaluation number_project average_monthly_hours time_spend_company Work_accident left promotion_last_5years department salary
0 0.38 0.53 2 157 3 0 1 0 sales low
4 0.37 0.52 2 159 3 0 1 0 sales low
5 0.41 0.50 2 153 3 0 1 0 sales low
9 0.42 0.53 2 142 3 0 1 0 sales low
10 0.45 0.54 2 135 3 0 1 0 sales low
... ... ... ... ... ... ... ... ... ... ...
14992 0.40 0.48 2 155 3 0 1 0 support low
14994 0.40 0.57 2 151 3 0 1 0 support low
14995 0.37 0.48 2 160 3 0 1 0 support low
14996 0.37 0.53 2 143 3 0 1 0 support low
14998 0.37 0.52 2 158 3 0 1 0 support low

2412 rows × 10 columns

In [37]:
hr_dataset_encoded[(hr_dataset_encoded['satisfaction_level']<0.5)&(hr_dataset_encoded['satisfaction_level']>0.35)].corr()['left']\
.reset_index().rename(columns = {'index': 'variable', 'left':'left correlation'}) \
.loc[hr_dataset_encoded[(hr_dataset_encoded['satisfaction_level']<0.5)&(hr_dataset_encoded['satisfaction_level']>0.35)]\
.corr()['left'].reset_index().left.abs().argsort().values[::-1][:11]].iloc[1:,:]
Out[37]:
variable left correlation
2 number_project -0.689441
1 last_evaluation -0.491512
3 average_monthly_hours -0.476644
0 satisfaction_level -0.422229
4 time_spend_company -0.246803
5 Work_accident -0.204257
7 promotion_last_5years -0.084856
10 department_RandD -0.081256
13 department_management -0.067454
15 department_product_mng -0.057291
In [ ]:
# perhaps the lack of growth and low evaluation scores at around 3 year mark suggested these lower performing ex employees\
# sensed no progress at the company or potentially were even let go 
In [38]:
fig, axes = plt.subplots(2, 2, figsize=(18, 10))

fig.suptitle('Countplots of employees in region 2')

sns.countplot(ax=axes[0, 0],x="left", data=cluster2)
sns.countplot(ax=axes[0, 1],x="number_project", data=cluster2, hue = 'left')
sns.countplot(ax=axes[1, 0],x="average_monthly_hours", data=cluster2, hue = 'left')
sns.countplot(ax=axes[1, 1],x="time_spend_company", data=cluster2, hue = 'left')
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x26ae4b05130>
In [39]:
cluster3 = hr_dataset[(hr_dataset['satisfaction_level']> 0.7)]
cluster3
Out[39]:
satisfaction_level last_evaluation number_project average_monthly_hours time_spend_company Work_accident left promotion_last_5years department salary
1 0.80 0.86 5 262 6 0 1 0 sales medium
3 0.72 0.87 5 223 5 0 1 0 sales low
7 0.92 0.85 5 259 5 0 1 0 sales low
8 0.89 1.00 5 224 5 0 1 0 sales low
12 0.84 0.92 4 234 5 0 1 0 sales low
... ... ... ... ... ... ... ... ... ... ...
14985 0.91 0.99 5 254 5 0 1 0 technical medium
14986 0.85 0.85 4 247 6 0 1 0 technical low
14987 0.90 0.70 5 206 4 0 1 0 technical low
14990 0.89 0.88 5 228 5 1 1 0 support low
14993 0.76 0.83 6 293 6 0 1 0 support low

6298 rows × 10 columns

In [40]:
hr_dataset_encoded[(hr_dataset_encoded['satisfaction_level']> 0.7)].corr()['left']\
.reset_index().rename(columns = {'index': 'variable', 'left':'left correlation'})\
.loc[hr_dataset_encoded[(hr_dataset_encoded['satisfaction_level']> 0.7)].corr()['left']\
.reset_index().left.abs().argsort().values[::-1][:11]].iloc[1:,:]
Out[40]:
variable left correlation
4 time_spend_company 0.435619
1 last_evaluation 0.403203
3 average_monthly_hours 0.334504
2 number_project 0.303909
0 satisfaction_level -0.134416
5 Work_accident -0.121768
7 promotion_last_5years -0.053413
13 department_management -0.036722
10 department_RandD -0.031125
17 department_support 0.020701
In [ ]:
# for this dataset we see time_spend_company is the main reason they moved on
# interestingly this group was very happy and high performing
In [41]:
fig, axes = plt.subplots(2, 2, figsize=(18, 10))

fig.suptitle('Countplots of employees in region 3')

sns.countplot(ax=axes[0, 0],x="left", data=cluster3)
sns.countplot(ax=axes[0, 1],x="number_project", data=cluster3, hue = 'left')
sns.countplot(ax=axes[1, 0],x="last_evaluation", data=cluster3, hue = 'left')
sns.countplot(ax=axes[1, 1],x="time_spend_company", data=cluster3, hue = 'left')
Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x26ae36a1d00>
In [42]:
# lets look at the spike in plot 3 to better understand
In [43]:
cluster4 = hr_dataset[(hr_dataset['last_evaluation']> 0.99)]
cluster4
Out[43]:
satisfaction_level last_evaluation number_project average_monthly_hours time_spend_company Work_accident left promotion_last_5years department salary
8 0.89 1.0 5 224 5 0 1 0 sales low
59 0.85 1.0 4 225 5 0 1 0 technical low
91 0.89 1.0 5 246 5 0 1 0 sales low
99 0.90 1.0 5 221 6 0 1 0 sales medium
106 0.91 1.0 4 257 5 0 1 0 accounting medium
... ... ... ... ... ... ... ... ... ... ...
14782 0.73 1.0 4 252 5 0 1 0 technical medium
14866 0.77 1.0 4 232 5 0 1 0 technical medium
14877 0.84 1.0 5 242 5 0 1 0 sales low
14896 0.75 1.0 5 223 6 0 1 0 accounting medium
14959 0.59 1.0 2 155 5 0 1 0 sales low

283 rows × 10 columns

In [44]:
sns.countplot(x="left", data=cluster4)
Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x26ae4ba81f0>
In [45]:
sns.countplot(x="salary", data=cluster4, hue = 'left')
Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x26ae56fbcd0>
In [46]:
# from these plots we gather no one with a high salary left in this cluster and with a evaluation score of 1

Predictive modelling

In [49]:
hr_dataset_encoded
Out[49]:
satisfaction_level last_evaluation number_project average_monthly_hours time_spend_company Work_accident left promotion_last_5years salary department_IT department_RandD department_accounting department_hr department_management department_marketing department_product_mng department_sales department_support department_technical
0 0.38 0.53 2 157 3 0 1 0 1 0 0 0 0 0 0 0 1 0 0
1 0.80 0.86 5 262 6 0 1 0 2 0 0 0 0 0 0 0 1 0 0
2 0.11 0.88 7 272 4 0 1 0 2 0 0 0 0 0 0 0 1 0 0
3 0.72 0.87 5 223 5 0 1 0 1 0 0 0 0 0 0 0 1 0 0
4 0.37 0.52 2 159 3 0 1 0 1 0 0 0 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14994 0.40 0.57 2 151 3 0 1 0 1 0 0 0 0 0 0 0 0 1 0
14995 0.37 0.48 2 160 3 0 1 0 1 0 0 0 0 0 0 0 0 1 0
14996 0.37 0.53 2 143 3 0 1 0 1 0 0 0 0 0 0 0 0 1 0
14997 0.11 0.96 6 280 4 0 1 0 1 0 0 0 0 0 0 0 0 1 0
14998 0.37 0.52 2 158 3 0 1 0 1 0 0 0 0 0 0 0 0 1 0

14999 rows × 19 columns

In [50]:
X = hr_dataset_encoded.drop(['left'],axis = 1).values
y = hr_dataset_encoded['left'].values
In [51]:
# test train split our model, with a 80:20 split between the training and testing sets
In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
In [53]:
X_train[0] # we can see some variables are above the 0 to 1 range that most of the others are
Out[53]:
array([  0.75,   0.81,   5.  , 227.  ,   5.  ,   0.  ,   0.  ,   2.  ,
         0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
         0.  ,   1.  ])
In [54]:
# lets scale the model now, only number_project, average_monthly_hours, time_spend_company and salary need to be scaled\
# as they are not in a 0 to 1 range
In [55]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train[:,[2,3,4,7]] = sc.fit_transform(X_train[:,[2,3,4,7]])
X_test[:,[2,3,4,7]] = sc.fit_transform(X_test[:,[2,3,4,7]])
In [56]:
X_train[0]
Out[56]:
array([0.75      , 0.81      , 0.96618991, 0.52049956, 1.03337906,
       0.        , 0.        , 1.04398693, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        ])
In [57]:
# we can begin testing
In [58]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

dtc = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)
cm_dtc = confusion_matrix(y_test, y_pred)
print(cm_dtc)
acc_dtc = accuracy_score(y_test, y_pred) * 100
acc_dtc
[[2265   34]
 [  25  676]]
Out[58]:
98.03333333333333
In [59]:
from sklearn.svm import SVC

svc = SVC(kernel = 'linear', random_state = 0)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
cm_svc = confusion_matrix(y_test, y_pred)
print(cm_svc)
acc_svc = accuracy_score(y_test, y_pred)*100
acc_svc
[[2164  135]
 [ 527  174]]
Out[59]:
77.93333333333334
In [60]:
from sklearn.svm import SVC

classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)
               
y_pred = classifier.predict(X_test)
cm_ksvc = confusion_matrix(y_test, y_pred)
print(cm_svc)
acc_ksvc = accuracy_score(y_test, y_pred) *100
acc_ksvc
[[2164  135]
 [ 527  174]]
Out[60]:
95.96666666666667
In [61]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm_lr = confusion_matrix(y_test, y_pred)
print(cm_lr)
acc_lr = accuracy_score(y_test, y_pred)*100
acc_lr
[[2123  176]
 [ 528  173]]
Out[61]:
76.53333333333333
In [62]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
cm_nb = confusion_matrix(y_test, y_pred)
print(cm_nb)
acc_nb = accuracy_score(y_test, y_pred)*100
acc_nb
[[1683  616]
 [ 145  556]]
Out[62]:
74.63333333333333
In [63]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 0)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)
cm_rfc = confusion_matrix(y_test, y_pred)
print(cm_rfc)
acc_rfc = accuracy_score(y_test, y_pred)*100
acc_rfc
[[2294    5]
 [  17  684]]
Out[63]:
99.26666666666667
In [64]:
from sklearn.neighbors import KNeighborsClassifier

knc = KNeighborsClassifier(n_neighbors = 1, metric = 'minkowski', p = 2)
knc.fit(X_train, y_train)

y_pred = knc.predict(X_test)
cm_knc = confusion_matrix(y_test, y_pred)
print(cm_knc)
acc_knc = accuracy_score(y_test, y_pred)*100
acc_knc
[[2225   74]
 [  26  675]]
Out[64]:
96.66666666666667
In [65]:
# lets create a dataframe to represent these

values = {'Model': ['Decision Tree','Linear SVC', 'RBF SVC', 'Logistic Regression', 'Naive Bayes', ' Random Forest', ' KNeighbors'],
        'Accuracy %': [acc_dtc,acc_svc,acc_ksvc,acc_lr,acc_nb,acc_rfc,acc_knc]}

model_accuracy_df = pd.DataFrame(values, columns = ['Model', 'Accuracy %'])
model_accuracy_df['Accuracy %'] = model_accuracy_df['Accuracy %'].round(2)
model_accuracy_df
Out[65]:
Model Accuracy %
0 Decision Tree 98.03
1 Linear SVC 77.93
2 RBF SVC 95.97
3 Logistic Regression 76.53
4 Naive Bayes 74.63
5 Random Forest 99.27
6 KNeighbors 96.67
In [66]:
fig, ax = plt.subplots()
fig.set_size_inches(16,5)
model_bar = sns.barplot(x="Model", y="Accuracy %", data = model_accuracy_df)
plt.title("Accuracies of each model") 

for p in model_bar.patches:
    model_bar.annotate('{:.2f}%'.format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   size=15,
                   xytext = (0, -15), 
                   textcoords = 'offset points')

    
In [67]:
# we see that random forest has the best accuracy
In [68]:
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

fig.suptitle('Confusion Matrix for each Model')

sns.heatmap(cm_dtc, ax = axes[0,0],annot=True, cbar=False, cmap="Greens",  fmt="d").set_title('Decision Tree Model')
sns.heatmap(cm_svc, ax = axes[0,1], annot=True, cbar=False, cmap="Greens",  fmt="d").set_title('Linear SVC Model')
sns.heatmap(cm_ksvc, ax = axes[0,2],annot=True, cbar=False , cmap="Greens",  fmt="d").set_title('RBF SVC Model')
sns.heatmap(cm_lr, ax = axes[1,0], annot=True, cbar=False, cmap="Greens",  fmt="d").set_title('Linear Regression Model')
sns.heatmap(cm_rfc, ax = axes[1,1], annot=True, cbar=False, cmap="Greens", fmt="d").set_title('Random Forest Model')
sns.heatmap(cm_knc, ax = axes[1,2], annot=True, cbar=False, cmap="Greens", fmt="d").set_title('KNeighbours Model')
Out[68]:
Text(0.5, 1.0, 'KNeighbours Model')
In [71]:
# we find random forest to be the best model, with an accuracy of 99.27% after tuning the hyperparameters
In [70]:
sns.heatmap(cm_rfc, annot=True, cbar=False, cmap="Greens", fmt="d").set_title('Random Forest Confusion Matrix')
Out[70]:
Text(0.5, 1.0, 'Random Forest Confusion Matrix')
In [ ]:
hr_dataset_analysis.to_csv('hr_analysis.csv')
In [ ]:
 
In [ ]: